Exploratory Data Analysis

Reading & Preparing the data

# Load the four raw CSV tables used throughout the analysis.
# Paths are relative to the working directory.
food_train     <- read.csv(file = "data/food_train.csv")
food_test      <- read.csv(file = "data/food_test.csv")
food_nutrients <- read.csv(file = "data/food_nutrients.csv")
nutrients      <- read.csv(file = "data/nutrients.csv")

For convenience, I’ll shorten the category names (for part A only):

# Shorten the category labels.
# Since R 4.0, read.csv() returns character columns (stringsAsFactors = FALSE),
# and `levels<-` on a character vector only attaches an attribute without
# renaming anything. Convert to a factor first; as.factor() is a no-op when the
# column is already a factor, so this also works on older R versions.
# NOTE(review): the new names are matched by position to the sorted original
# labels - confirm the six original categories sort in this order.
food_train$category <- as.factor(food_train$category)
levels(food_train$category) <- c(
  "cakes", "candy", "chips", "chocolate", "cookies", "popcorn"
)

Interactive Plots

Household Serving

# Proportional stacked bar chart of household-serving keywords by category.
# `data_plot_fulltext` is expected to already carry the columns
# `household_serving_update`, `category`, `n`, `sum_n` and `entropy`
# (they are used below but not created here).
# NOTE(review): `pct` is n / sum(n) over whatever grouping
# `data_plot_fulltext` carries - confirm it is grouped as intended.
fulltext_plotly <- data_plot_fulltext %>%
  mutate(pct = paste0(round(n / sum(n) * 100, 2), " %")) %>%
  ggplot(aes(
    # Negated entropy so the flipped axis reads in ascending entropy order.
    x = reorder(household_serving_update, -entropy),
    y = n,
    fill = category,
    # `text` is not drawn by ggplot2; it is picked up as the plotly tooltip.
    text = paste(
      "Household serving:", household_serving_update,
      "\n Category:", category,
      "\n Count", n,
      "\n Percentage", pct,
      "\n Total n", sum_n,
      "\n Entropy", round(entropy, 2)
    )
  )) +
  geom_col(position = "fill", color = "white", size = .3) +
  coord_flip() +
  theme_light() +
  theme(text = element_text(size = 10)) +
  labs(
    x = "", y = "",
    title = "Keywords by Category",
    subtitle = "Ordered by Entropy (ascending)"
  ) +
  scale_y_continuous(labels = percent)
# Render the interactive version, as done for the other plots in this section.
ggplotly(fulltext_plotly, tooltip = "text")

Ingredients

# Heat map of ingredient counts per category.
# Per ingredient: `sum_n` is the total count across its rows and `pct` is each
# row's share of that total; only ingredients with sum_n > 3000 are kept.
ingredients_plotly <- n_top_ingredient(ingredients_data_train, 1000) %>%
  group_by(ingredient) %>%
  mutate(
    pct = paste0(round(n / sum(n) * 100, 2), " %"),
    sum_n = sum(n)
  ) %>%
  filter(sum_n > 3000) %>%
  ggplot(aes(
    x = category,
    y = reorder(ingredient, sum_n),
    fill = n,
    # `text` is only used as the plotly tooltip.
    text = paste(
      "Ingredient: ", ingredient,
      "\n Category: ", category,
      "\n Count: ", n,
      "\n Pct: ", pct,
      "\n Total: ", sum_n
    )
  )) +
  scale_fill_gradient2(
    low = "#F9EBEA",
    mid = "#CD6155",
    high = "#922B21",
    midpoint = 3000
  ) +
  geom_tile() +
  labs(
    x = "",
    y = "",
    title = "Top Ingredients by Category",
    subtitle = "Color by Count of Products Contain the Ingredient"
  ) +
  theme_bw() +
  theme(axis.text = element_text(size = 8))
ggplotly(ingredients_plotly, tooltip = "text")

Description

# Proportional stacked bar chart of common description words by category.
# Per word: `sum_n` is the total count, `pct` is each row's share of that
# total, and `entropy` is entropy_fun() applied to those shares.
# Only words with sum_n > 750 are plotted.
description_plotly <- top_words(food_train, "description", by_category = TRUE) %>%
  group_by(word) %>%
  mutate(
    sum_n = sum(n),
    pct = n / sum(n),
    entropy = entropy_fun(pct)
  ) %>%
  # Format the share as a label only after entropy has used the numeric value.
  mutate(pct = paste0(round(pct * 100, 2), " %")) %>%
  filter(sum_n > 750) %>%
  ggplot(aes(
    # Negated entropy so the flipped axis reads in ascending entropy order.
    x = reorder(word, -entropy),
    y = n,
    fill = category,
    # `text` is only used as the plotly tooltip.
    text = paste(
      "Description word:", word,
      "\n Category:", category,
      "\n Count", n,
      "\n Percentage", pct,
      "\n Total n", sum_n,
      "\n Entropy", round(entropy, 2)
    )
  )) +
  geom_col(position = "fill", color = "white", size = .3) +
  coord_flip() +
  theme_light() +
  theme(text = element_text(size = 10)) +
  labs(
    x = "", y = "",
    title = "Description Common Words by Category",
    subtitle = "Ordered by Entropy (Ascending)"
  ) +
  scale_y_continuous(labels = percent)
ggplotly(description_plotly, tooltip = "text")

Food_nutrition & Nutrients

What are the most common nutrients and their average amount? (top 5)
# Heat map of average nutrient amount per category.
# Within each unit of measurement (`unit_name`), the mean amount is divided by
# the largest mean amount in that unit, so values measured in different units
# can share one colour scale. `name` is turned into a factor with levels taken
# from `order_nutrient`, which fixes the row order.
plotly_nutr <- nutr_data2 %>%
  group_by(unit_name) %>%
  mutate(
    normalize_mean_amount = mean_amount / max(mean_amount),
    name = factor(name, levels = order_nutrient)
  ) %>%
  ggplot(aes(
    x = category,
    y = name,
    fill = normalize_mean_amount,
    # `text` is only used as the plotly tooltip.
    text = paste(
      "Nutrient", name,
      "\n Category", category,
      "\n Average Amount", round(mean_amount, 2),
      "\n Normalizes mean", normalize_mean_amount,
      "\n Count", n
    )
  )) +
  geom_tile() +
  scale_fill_gradient(low = "#CDFFF7", high = "#007F5F", name = "") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 6.5),
    text = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    legend.position = "right"
  ) +
  labs(
    title = "Normalized Average Amount of Food Nutrition by Snack's Category",
    y = "Nutrient Name",
    x = "",
    subtitle = "For each of the unit sizes - normalized average amount between 0 to 1",
    caption = "Sorted by the most common nutrient(on top), and goes down to the less common ones"
  )
ggplotly(plotly_nutr, tooltip = "text")